# Replication materials for Clause Analysis (Van Atteveldt et al., Political Analysis)
#
# Code to create the figures and tables from the ONLINE APPENDIX: ANALYSIS OF THE GAZA WAR COMPARED TO BASELINE
# This code requires the parsed texts from tokens.rds.
# Unfortunately, this file cannot be shared due to copyright restrictions.
# Please see get_tokens.r on how to recreate this file (requires access to the AmCAT database)

source("functions.r")

# Load the parsed token table.
tokens <- readRDS("tokens.rds")


# Make tokens$sentence unique across articles: a sentence id that occurs under
# more than one article id (aid) is kept for the first of those articles; every
# other article involved gets all of its sentence ids shifted past the current
# maximum sentence id.
pairs <- unique(tokens[c("aid", "sentence")])
per.id <- table(pairs$sentence)
shared.ids <- as.numeric(names(per.id)[per.id > 1])
shared <- unique(tokens[tokens$sentence %in% shared.ids, c("aid", "sentence")])
shared <- arrange(shared, sentence)
# Sentence id of the preceding row; -1 is a sentinel for the first row.
previous <- c(-1, head(shared$sentence, -1))
dupe.aids <- shared$aid[shared$sentence == previous]
shift.rows <- tokens$aid %in% dupe.aids
tokens$sentence[shift.rows] <- tokens$sentence[shift.rows] + max(tokens$sentence)

# Flag every token that belongs to a sentence containing at least one attack
# token.
ags <- unique(tokens$sentence[tokens$attack])
tokens$ags <- tokens$sentence %in% ags

# add 'src' on tokens up to and including the first say verb of a sentence
# Entries must match lemmas exactly (no surrounding whitespace), because they
# are compared with %in%.
# NOTE(review): "acknowledge" was previously written with a leading space and
# therefore never matched; correcting it may change the counts slightly.
.SAY_VERBS <- c("tell", "show", "acknowledge", "admit", "affirm", "allege",
                "announce", "assert", "attest", "avow", "claim", "comment",
                "concede", "confirm", "declare", "deny", "exclaim", "insist",
                "mention", "note", "proclaim", "remark", "report", "say",
                "speak", "state", "suggest", "talk", "write", "add")
say <- tokens[tokens$lemma %in% .SAY_VERBS, c("sentence", "offset")]
# Offset of the first say verb in each sentence.
say <- aggregate(say["offset"], say["sentence"], min)
say$src <- TRUE
# merge() sorts its result on the key columns; naming them explicitly
# guarantees sentence-then-offset order, which the backward fill relies on.
tokens <- merge(tokens, say, by = c("sentence", "offset"), all.x = TRUE)
# Sentence id of the following row (-1 after the last row). Computed after the
# merge so it lines up with the re-sorted rows.
leads <- c(tokens$sentence[-1], -1)
# The last token of every sentence is set to FALSE so that the backward fill
# below stops at the sentence boundary.
tokens$src[tokens$sentence != leads] <- FALSE
tokens$src <- na.locf(tokens$src, fromLast = TRUE)

# Collapse actor tokens to one row per (country, aid, sentence, ags, src,
# actor) combination, keeping the smallest token offset of each.
actor.keys <- c("country", "aid", "sentence", "ags", "src", "actor")
actors <- tokens[tokens$actor != "", c(actor.keys, "offset")]
actors <- aggregate(actors["offset"], actors[actor.keys], min)
# Only the source position is known for this table: there is no 'subj' column
# in this data, so every non-source actor is labelled "other" explicitly
# instead of relying on ifelse() over a missing column.
actors$place <- ifelse(actors$src, "source", "other")
actors <- arrange(actors, sentence, offset)


# table A1: who is quoted
# Source-actor counts by country over all sentences (nns) and over sentences
# flagged as containing an attack (nnsa), each followed by the Israel/Hamas
# ratio per country.
is.source <- actors$place == "source"

quoted <- actors[which(is.source), ]
nns <- table(actor = quoted$actor, country = quoted$country)
nns["Israel", ] / nns["Hamas", ]

quoted.ags <- actors[which(actors$ags & is.source), ]
nnsa <- table(actor = quoted.ags$actor, country = quoted.ags$country)
nnsa["Israel", ] / nnsa["Hamas", ]

# Columns are picked by position from the combined table
# (assumes two country columns per table -- TODO confirm).
cbind(nns, nnsa)[c("Israel", "Hamas"), c(2, 4, 1, 3)]


# table A2: who is aggressor / victim


# Start again from the stored tokens for the aggressor / victim analysis.
tokens <- readRDS("tokens.rds")

# Make tokens$sentence unique across articles: a sentence id that occurs under
# more than one article id (aid) is kept for the first of those articles; every
# other article involved gets all of its sentence ids shifted past the current
# maximum sentence id.
pairs <- unique(tokens[c("aid", "sentence")])
per.id <- table(pairs$sentence)
shared.ids <- as.numeric(names(per.id)[per.id > 1])
shared <- unique(tokens[tokens$sentence %in% shared.ids, c("aid", "sentence")])
shared <- arrange(shared, sentence)
# Sentence id of the preceding row; -1 is a sentinel for the first row.
previous <- c(-1, head(shared$sentence, -1))
dupe.aids <- shared$aid[shared$sentence == previous]
shift.rows <- tokens$aid %in% dupe.aids
tokens$sentence[shift.rows] <- tokens$sentence[shift.rows] + max(tokens$sentence)

# Keep only tokens from sentences that contain at least one attack token.
ags <- unique(tokens$sentence[tokens$attack])
tokens$ags <- tokens$sentence %in% ags
tokens <- tokens[tokens$ags, ]

# NOTE(review): src is FALSE for every token in this section, so no actor is
# ever placed as "source" below -- confirm that skipping the say-verb
# detection here is intended.
tokens$src <- FALSE

# add 'subj' on tokens up to and including the first attack token of a sentence
attack <- tokens[tokens$attack & !tokens$src, c("sentence", "offset")]
# Offset of the first attack token in each sentence.
attack <- aggregate(attack["offset"], attack["sentence"], min)
attack$subj <- TRUE
# merge() sorts its result on the key columns; naming them explicitly
# guarantees sentence-then-offset order, which the backward fill relies on.
tokens <- merge(tokens, attack, by = c("sentence", "offset"), all.x = TRUE)
# Sentence id of the following row (-1 after the last row). Computed after the
# merge so it lines up with the re-sorted rows.
leads <- c(tokens$sentence[-1], -1)
# Close each sentence with FALSE (unless its last token is itself the first
# attack token) so the backward fill stops at the sentence boundary.
tokens$subj[is.na(tokens$subj) & tokens$sentence != leads] <- FALSE
tokens$subj <- na.locf(tokens$subj, fromLast = TRUE)

# Collapse actor tokens to one row per (country, aid, sentence, ags, src, subj,
# actor) combination, keeping the smallest token offset, then label each row
# as source, subject or object.
group.cols <- c("country", "aid", "sentence", "ags", "src", "subj", "actor")
mentions <- tokens[tokens$actor != "", c(group.cols, "offset")]
actors <- aggregate(mentions["offset"], mentions[group.cols], min)
actors$place <- with(actors, ifelse(src, "source", ifelse(subj, "subj", "obj")))
actors <- arrange(actors, sentence, offset)


# Counts of non-source actors by place, actor and country.
non.source <- actors$ags & actors$place != "source"
nnp <- with(actors[non.source, ], table(place, actor, country))
# Share of each actor's mentions that are in subject position, per country.
nnp["subj", , "cn"] / colSums(nnp[, , "cn"])
nnp["subj", , "us"] / colSums(nnp[, , "us"])

# Counts for both countries side by side, plus rounded column percentages.
counts <- cbind(nnp[, , "us"], nnp[, , "cn"])
shares <- round(100 * prop.table(counts, margin = 2))

# Rows and columns are reordered by position, interleaving each count column
# with its percentage column.
col.order <- c(2, 1, 4, 3)
tt <- cbind(counts[, col.order], shares[, col.order])
tt <- tt[c(2, 1), c(1, 5, 2, 6, 3, 7, 4, 8)]
rbind(tt, Total = colSums(tt))



# Figure A1
# Note: this is not deterministic, so result might not be identical to published version
# Install with devtools::install_github("kasperwelbers/corpus-tools")
library(corpustools)
# Install with devtools::install_github("kasperwelbers/semnet")
library(semnet)

source("functions.r")

# Stop words: first column of a header-less CSV file.
stopwords <- read.csv("stop-word-list.csv", header = FALSE,
                      stringsAsFactors = FALSE)[[1]]

# Tokens used for the figure: in an attack sentence, neither source nor
# subject, with a pos1 tag of V, N, M or A, and not a stop word.
in.predicate <- tokens$ags & !tokens$src & !tokens$subj
has.kept.pos <- tokens$pos1 %in% c("V", "N", "M", "A")
is.stopword <- tokens$lemma %in% stopwords
preds <- tokens[in.predicate & has.kept.pos & !is.stopword, ]
# Document-term matrix with sentences as documents and lemmas as terms.
predicates <- dtm.create(preds$sentence, preds$lemma,
                         filter.chars = FALSE, minlength = 2)


# Sentence ids per country.
cusa <- unique(tokens$sentence[tokens$country == "us"])
cchina <- unique(tokens$sentence[tokens$country == "cn"])

# Sentence ids in which the given actor occupies the given place, restricted
# to rows flagged ags.
ags.sentences <- function(place, actor) {
  keep <- actors$place == place & actors$ags & actors$actor == actor
  unique(actors$sentence[keep])
}

sh <- ags.sentences("subj", "Hamas")
si <- ags.sentences("subj", "Israel")

oh <- ags.sentences("obj", "Hamas")
oi <- ags.sentences("obj", "Israel")

# Sentences in which both actors occupy the same place.
sboth <- intersect(sh, si)
oboth <- intersect(oh, oi)
# N
length(intersect(cusa, si))
length(intersect(cchina, si))
length(intersect(cusa, sh))
length(intersect(cchina, sh))


library(semnet)

# Sentence sets per country and subject actor.
us.israel <- intersect(cusa, si)
cn.israel <- intersect(cchina, si)
us.hamas <- intersect(cusa, sh)
cn.hamas <- intersect(cchina, sh)

# One plot per (country, subject actor) pair, each called with the other
# country's sentences for the same actor as the third argument
# (presumably the comparison set -- verify against plot.cooc.from.dtm).
plot.cooc.from.dtm(predicates, us.israel, cn.israel)
plot.cooc.from.dtm(predicates, cn.israel, us.israel)
plot.cooc.from.dtm(predicates, us.hamas, cn.hamas)
plot.cooc.from.dtm(predicates, cn.hamas, us.hamas)


